library(tidyverse)
library(bigrquery)
library(wesanderson)
library(plotly)
library(hrbrthemes)
library(patchwork)
library(flextable)
library(ggstatsplot)
library(GGally)
library(viridis)
library(lubridate)
library(geosphere)

project_id <- "tc-da-1"
dataset_id <- "olist_db"

con <- dbConnect(
  bigrquery::bigquery(),
  project = project_id,
  dataset = dataset_id
)

# to close the connection: dbDisconnect(con)

select
  price, freight_value,
  round(freight_value / (freight_value + price),2) as freight_ratio,
from
  `olist_db.olist_order_items_dataset`

Distribution of freight ratio

df <- df_freight_ratio

# # Simple histogram - refined below by calculating the values manually
# hist(df$freight_ratio, main = "Number of order in different ranges of shipping cost",
#           xlab = "freight ratio = freight cost / (price + freight cost)",
#           ylab = "number of orders", col = "steelblue", border = "white")
# abline(v = 0.19, col = "red")
# text(x = 0.19, y = par("usr")[4] - 2500, labels = "Median = 0.19", pos = 4, col = "red")


# Calculate revenue and number of orders - as well as their proportion and 
# cumulative sum - for different segments of freight ratio
fr_bin_value <- 0.05

ddf <- df %>%
  mutate(fr_bin = cut(freight_ratio, breaks = seq(0, 1, by = fr_bin_value)) ) %>%
  group_by(fr_bin) %>% 
  mutate(fr_bin = max(freight_ratio) ) %>% 
  reframe(
    revenue = sum(price),
    n_orders = n(),
    prop_orders = n() / nrow(df),
    prop_total_revenue = sum(price) / sum(df$price)
  ) %>%
  na.omit() %>% 
  arrange(fr_bin) %>% 
  mutate(
    cum_revenue = cumsum(revenue),
    cum_prop_revenue = cumsum(prop_total_revenue)
  )



scale_dual_axis = 0.2

ddf %>% 
  ggplot(aes(x = fr_bin)) +
  geom_bar(aes(y = prop_orders), stat = "identity", fill = "steelblue", color = "white", width = fr_bin_value) +
  geom_line(aes(y = cum_prop_revenue*scale_dual_axis), color = "red") +
  scale_y_continuous(
    sec.axis = sec_axis(~./scale_dual_axis, name = "% Cumulative Revenue", breaks = seq(0, 1, 0.1))
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(0, 1, fr_bin_value)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_cartesian(xlim = c(0, 0.8)) +
  labs(
    title = "Orders for different segments of Freight Ratio", 
    x = "Freight ratio segments",
    y = "Proportion of total Orders"
  ) + theme(
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

df %>% 
 mutate(price_qtile = ntile(price, 4)) %>% 
 group_by(price_qtile) %>% 
 mutate(price_range = paste0(min(price),"-",max(price)) %>% factor) %>% 
ggplot(aes(x = freight_ratio, fill = as.factor(price_range)) ) +
  geom_histogram(binwidth = 0.02, alpha = 1) +
  # stat_bin(aes(y=..count../sum(..count..)),geom="step", binwidth = 0.02) +
  # geom_density(alpha = 0.7) +
  # facet_wrap(~ price_range, ncol = 2) +
  labs(title = "Histogram of Freight Ratio by Price Range",
       x = "Freight Ratio", 
       y = "Number of Orders",
       fill = "Segments of orders \nby value (quartiles)") +
  theme_minimal() +
  scale_fill_manual(values = wesanderson::wes_palette("Zissou1")) +
  theme(legend.position = c(0.7,0.7))

# Freight ratio in 4 categories of products of increasing price
df %>% 
 mutate(price_qtile = ntile(price, 4)) %>% 
 group_by(price_qtile) %>% 
 mutate(price_range = paste0(min(price),"-",max(price)) %>% factor) %>% 
 ggplot(aes(x = price_range, y = freight_ratio)) +
 geom_boxplot(color = "steelblue") +
 stat_summary(
    fun.data = function(y) data.frame(y = median(y), label = median(y)),
    geom = "text", hjust = 4, vjust = 0.5, color = "black", size = 3
  ) +
 theme_minimal() +
 labs(
   title = "Freight ratio for orders of different value",
   x = "Price range (quartiles)",
   y = "Freight ratio" 
 )

# Freight ratio decrease very quick with order value
df %>% 
 ggplot(aes(x = price, y = freight_ratio)) +
 geom_point(alpha = 0.1, color = "steelblue") +
 # theme_minimal() +
 theme_minimal() +
 scale_x_continuous(
  trans = "log", 
  labels = function(x) round(x,2),
  breaks = 2^(0:12)
 ) + 
  labs(
    title = "Very high shipping fees for orders < 100 BR$",
    x = "Order value (log)",
    y = "Freight ratio"
  )

# # Alternative representation: 2D histogram (to highlight counts)
# df %>% 
#   # filter(freight_ratio >= 0.2) %>%
#   ggplot(aes(x = freight_ratio, y = price)) +
#   geom_hex(bins = 30) +
#   scale_fill_continuous(type = "viridis", trans = "log", breaks = 10^seq(4)) +
#   scale_y_continuous(trans = "log", breaks = 10^seq(3) ) +
#   scale_x_continuous(breaks = seq(0,1,0.1)) +
#   theme_minimal() +
#   coord_fixed(ratio = 0.1) +
#   theme(
#     axis.text.x = element_text(size = 12),
#     axis.text.y = element_text(size = 12),
#     panel.grid.minor = element_blank()
#   )


# # Original (not log transformed) values
# df %>%
#  filter(freight_ratio > 0, price < 1500) %>%
#  sample_frac(0.1) %>%
#  ggplot(aes(x = price, y = freight_ratio)) +
#  geom_point(alpha = 0.1, color = "steelblue") +
#  theme_minimal()

Effect of lowering the freight ratio

Preliminary check

Revenues after lowering FR of a given percent

pct_decrease_fr <- 0.05

ddf <- df %>% 
  rename(fr = freight_ratio) %>%
  filter(fr > 0) %>%
  mutate(lower_fr = round(fr * (1 - pct_decrease_fr),2) ) %>% 
  add_count(fr, name = "cnt_fr") %>% 
  add_count(lower_fr, name = "cnt_lower_fr") %>% 
  mutate(order_increase = cnt_lower_fr / cnt_fr) %>% 
  group_by(fr) %>% 
  reframe(
    AOV = sum(price) / n(),
    current_revenue = round(cnt_fr * AOV,2) %>% unique,
    new_revenue = round(cnt_lower_fr * AOV,2) %>% unique
  ) %>%
  # express in deciles instead of percentiles of freight_ratio
  mutate(ntile_fr = ntile(fr,10)) %>%
  group_by(ntile_fr) %>% 
  mutate(fr_range = paste(min(fr)*100,"-", max(fr)*100,"%")) %>% 
  group_by(fr_range) %>% 
  reframe(
    current_revenue = sum(current_revenue),
    new_revenue = sum(new_revenue)
  ) 

pct_increase_revenue <- round((sum(ddf$new_revenue) / sum(ddf$current_revenue) - 1) * 100,2)

ddf %>% 
  pivot_longer(cols = c("current_revenue","new_revenue"), names_to = "revenue") %>% 
  ggplot(aes(x = fr_range, y = value, fill = revenue)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate x-axis labels
        axis.title.y = element_text(angle = 90)) +
  scale_y_continuous(labels = scales::comma_format()) +
  labs(
    title = paste0("Expected increase in revenue: ",pct_increase_revenue,"%"),
    subtitle = paste0("when freight ratio is lowered by ",pct_decrease_fr*100,"%"),
    x = "Freight ratio segment", y = "revenue per segment"
  ) +
  theme(
    legend.position = c(0.7,0.7),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

Profits after lowering FR

Olist generally takes a cut of 10% on the price of the item (shipping fees excluded).

pct_decrease_fr <- 0.05

ddf <- df %>% 
  rename(fr = freight_ratio) %>%
  filter(fr > 0) %>%
  mutate(freight_discount = freight_value * pct_decrease_fr) %>% 
  mutate(lower_fr = round(fr * (1 - pct_decrease_fr),2) ) %>% 
  add_count(fr, name = "cnt_fr") %>% 
  add_count(lower_fr, name = "cnt_lower_fr") %>% 
  mutate(order_increase = cnt_lower_fr / cnt_fr) %>% 
  group_by(fr) %>% 
  reframe(
    AOV = sum(price) / n(),
    AO_freight_discount = sum(freight_discount) / n(),
    current_revenue = round(cnt_fr * AOV,2) %>% unique,
    new_revenue = round(cnt_lower_fr * AOV,2) %>% unique,
    freight_loss = round(cnt_lower_fr * AO_freight_discount,2) %>% unique
  ) %>%
  # express in deciles instead of percentiles of freight_ratio
  mutate(ntile_fr = ntile(fr,10)) %>%
  group_by(ntile_fr) %>% 
  mutate(fr_range = paste(min(fr)*100,"-", max(fr)*100,"%")) %>% 
  group_by(fr_range) %>% 
  reframe(
    current_revenue = sum(current_revenue),
    new_revenue = sum(new_revenue),
    total_freight_loss = sum(freight_loss)
  )


# Applying the freight discount on all freight ratio segments
ddf %>% 
  mutate(
    current_profit = current_revenue * 0.10,
    new_profit = (new_revenue * 0.10) - total_freight_loss
  ) %>% 
  summarise(
    pct_increase_profit = (sum(new_profit) - sum(current_profit)) / sum(current_profit) * 100,
    profit_increase = sum(current_profit) * pct_increase_profit / 100
  ) %>% flextable() %>% 
  add_header_row(values = c("Apply discount to all segments",""))
# Applying the freight discount only to the 10-18% freight ratio
ddf %>% 
  mutate(
    current_profit = current_revenue * 0.10,
    new_profit = ifelse(
      fr_range == "10 - 18 %", new_revenue * 0.10, current_revenue * 0.10 
    )
  ) %>% 
  summarise(
    pct_increase_profit = (sum(new_profit) - sum(current_profit)) / sum(current_profit) * 100,
    profit_increase = sum(current_profit) * pct_increase_profit / 100
  ) %>% flextable() %>% 
  add_header_row(values = c("Apply discount to 10-18% fr segment",""))

Freight ratio and stars

There is actually no difference in the reviews for sellers with different freight ratio


select
  t1.order_id,
  round(t1.freight_value / (t1.freight_value + t1.price),2) as freight_ratio,
  review_score as n_stars
from `olist_db.olist_order_items_dataset` t1 join `olist_db.olist_order_reviews_dataset` t2
  on t1.order_id = t2.order_id
# df_FR_stars %>% 
#   mutate(qtile_FR = ntile(freight_ratio, 4)) %>%
#   group_by(qtile_FR, n_stars) %>% 
#   reframe(
#     cnt = n()
#   ) %>% 
#   ungroup %>% group_by(qtile_FR) %>%
#   reframe(
#     n_stars = unique(n_stars),
#     prop = cnt / sum(cnt)
#   ) 

ddf_stars <- df_FR_stars %>% 
  mutate(qtile_FR = ntile(freight_ratio, 4)) %>% 
  add_count(qtile_FR, name = "n_orders") %>% 
  group_by(qtile_FR, n_stars) %>% 
  reframe(
    prop = unique(n() / n_orders)
  )


# # Quartiles of FR 
# ddf_stars <- df_FR_stars %>% 
#   mutate(qtile_FR = ntile(freight_ratio, 4))
# 
# ggbarstats(
#   y = qtile_FR,
#   x = n_stars,
#   data = ddf_stars,
#   results.subtitle = F,
#   label = "percent",
#   caption = F
# )

# High and low FR
df_FR_stars %>% 
  mutate(FR_segment = ifelse(freight_ratio <= 0.2, "low","high")) %>% 
  ggbarstats(
    y = FR_segment,
    x = n_stars,
    results.subtitle = F,
    label = "percent"
  ) +
  # scale_fill_viridis_d()
  scale_fill_brewer(palette = "Dark1", direction = -1) + # Set1,2,3, Dark1,2,3, Paired, Accent, Pastel1,2, 
  theme(axis.text.x = element_text(size = 12)) +
  labs(
    title = "Number of Stars in Reviews for high and low freight ratio",
    x = "Freight ratio"
  )

Distance between customer and seller

Get the data using SQL

/* Distance between seller and customer */

-- NB the lat/lng for each zipcode are averaged

with 
geo as
(
  select
    distinct(geolocation_zip_code_prefix) zipcode,
    avg(geolocation_lat) as lat,
    avg(geolocation_lng) as lng,
  from `olist_db.olist_geolocation_dataset`
  group by geolocation_zip_code_prefix
  order by
    geolocation_zip_code_prefix
),

seller_info as 
(
  select
    distinct(sellers.seller_id),
    seller_zip_code_prefix as seller_zipcode,
    geo.lat as seller_lat, 
    geo.lng as seller_lng,
    seller_city,
    seller_state,
    order_id,
    price,
    freight_value,
    product_weight_g,
    product_length_cm * product_height_cm * product_width_cm as product_volume_cm3
  from `olist_db.olist_sellers_dataset` sellers join `olist_db.olist_order_items_dataset` items
    on sellers.seller_id = items.seller_id
  join geo on sellers.seller_zip_code_prefix = geo.zipcode
  join `olist_db.olist_products_dataset` products on items.product_id = products.product_id
),
customer_info as
(
  select
    distinct(cust.customer_id),
    customer_zip_code_prefix as customer_zipcode,
    customer_city,
    customer_state,
    order_id,
    date(order_purchase_timestamp) as date_purchased,
    date(order_delivered_customer_date) as date_delivered,
    geo.lat as customer_lat,
    geo.lng as customer_lng
  from 
    `olist_db.olist_customesr_dataset` cust join `olist_db.olist_orders_dataset` orders
      on cust.customer_id = orders.customer_id
    join geo on cust.customer_zip_code_prefix = geo.zipcode
)
select 
  *
from seller_info join customer_info
  on seller_info.order_id = customer_info.order_id
;

Calculate distance in km between seller and customer

library(lubridate)

df <- df_sql %>% 
  select(-order_id_1) %>%
  mutate(fr = freight_value / (price + freight_value)) %>%  # calculate freigth_ratio 
  mutate(fr_segment = ifelse(fr <= 0.2, "low","high")) %>%  # set high and low fr (<>0.2)
  mutate(days_to_deliver = interval(date_purchased, date_delivered) %>% as.numeric()  / (60 * 60 * 24)) %>% 
  filter(!is.na(days_to_deliver))

# df %>% glimpse

library(geosphere)

# Function to calculate distance using distGeo
# s_lng/lat = seller; c_lng/lat = customer
calculate_distance <- function(s_lng, s_lat, c_lng, c_lat) {
  distm(
    matrix(c(s_lng, s_lat), ncol = 2), 
    matrix(c(c_lng, c_lat), ncol = 2), 
    fun = distGeo
  )
}

D_vector <- list(
  df$seller_lng, df$seller_lat, 
  df$customer_lng, df$customer_lat
) %>% 
  pmap_dbl(calculate_distance, .progress = TRUE)
##  â– â– â– â– â–                              13% |  ETA: 16s
##  â– â– â– â– â– â– â– â– â– â–                         29% |  ETA: 13s
##  â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–                    45% |  ETA: 10s
##  â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–                61% |  ETA:  7s
##  â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–           77% |  ETA:  4s
##  â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â– â–      93% |  ETA:  1s
df$distance_km <- round(D_vector/1000)

df$distance_km[df$distance_km < 1000] %>% hist

Limit distance to 2000 km and days to deliver to 40

df %>% select_if(is.numeric) %>%  summary
##  seller_zipcode    seller_lat        seller_lng         price        
##  Min.   : 1001   Min.   :-32.079   Min.   :-63.89   Min.   :   0.85  
##  1st Qu.: 6506   1st Qu.:-23.613   1st Qu.:-48.83   1st Qu.:  40.00  
##  Median :13660   Median :-23.419   Median :-46.76   Median :  78.00  
##  Mean   :24616   Mean   :-22.795   Mean   :-47.24   Mean   : 123.92  
##  3rd Qu.:29154   3rd Qu.:-21.757   3rd Qu.:-46.52   3rd Qu.: 139.00  
##  Max.   :99730   Max.   : -2.501   Max.   :-34.86   Max.   :6735.00  
##                                                                      
##  freight_value    product_weight_g product_volume_cm3 customer_zipcode
##  Min.   :  0.00   Min.   :    0    Min.   :   168     Min.   : 1003   
##  1st Qu.: 13.12   1st Qu.:  300    1st Qu.:  2816     1st Qu.:11250   
##  Median : 16.32   Median :  700    Median :  6358     Median :24350   
##  Mean   : 20.07   Mean   : 2087    Mean   : 15130     Mean   :35057   
##  3rd Qu.: 21.19   3rd Qu.: 1800    3rd Qu.: 18200     3rd Qu.:58415   
##  Max.   :409.68   Max.   :40425    Max.   :296208     Max.   :99980   
##                   NA's   :16       NA's   :16                         
##   customer_lat     customer_lng           fr         days_to_deliver 
##  Min.   :-33.69   Min.   :-72.669   Min.   :0.0000   Min.   :  0.00  
##  1st Qu.:-23.59   1st Qu.:-48.110   1st Qu.:0.1162   1st Qu.:  7.00  
##  Median :-22.92   Median :-46.633   Median :0.1833   Median : 10.00  
##  Mean   :-21.21   Mean   :-46.192   Mean   :0.2093   Mean   : 12.43  
##  3rd Qu.:-20.14   3rd Qu.:-43.635   3rd Qu.:0.2771   3rd Qu.: 16.00  
##  Max.   : 42.18   Max.   : -8.724   Max.   :0.9633   Max.   :210.00  
##                                                                      
##   distance_km    
##  Min.   :   0.0  
##  1st Qu.: 187.2  
##  Median : 433.0  
##  Mean   : 598.4  
##  3rd Qu.: 796.0  
##  Max.   :8652.0  
## 
describe <- function(column, take_log = 0) {
  par(mfrow = c(1,2))
  var <- df[[column]]
  if(take_log == 1) {var = log(var)}
  boxplot(var, main = column)
  hist(var)
}


describe("days_to_deliver", 0)

df %>% glimpse()
## Rows: 99,478
## Columns: 23
## $ seller_id          <chr> "0cbcee27c791afa0cdcb08587a2013a8", "9bcdfa7b615b3a…
## $ seller_zipcode     <int> 37410, 89053, 87502, 24020, 29156, 85863, 37564, 93…
## $ seller_lat         <dbl> -21.69337, -26.87730, -23.75942, -22.89370, -20.278…
## $ seller_lng         <dbl> -45.25977, -49.08145, -53.29278, -43.12042, -40.411…
## $ seller_city        <chr> "tres coracoes", "blumenau", "umuarama", "niteroi",…
## $ seller_state       <chr> "MG", "SC", "PR", "RJ", "ES", "PR", "MG", "RS", "SC…
## $ order_id           <chr> "2c45c33d2f9cb8ff8b1c86cc28c11c30", "c158e9806f85a3…
## $ price              <dbl> 135.00, 79.90, 279.90, 689.90, 84.90, 279.00, 119.9…
## $ freight_value      <dbl> 18.51, 21.01, 16.13, 98.81, 36.14, 16.12, 17.58, 30…
## $ product_weight_g   <int> 2250, 300, 500, 22350, 8325, 3800, 1350, 300, 50, 1…
## $ product_volume_cm3 <int> 6600, 3500, 3600, 228480, 19866, 55352, 14350, 2160…
## $ customer_id        <chr> "de4caa97afa80c8eeac2ff4c8da5b72e", "25456ee3b0cf84…
## $ customer_zipcode   <int> 88058, 74475, 3380, 37142, 18078, 13224, 28740, 638…
## $ customer_city      <chr> "florianopolis", "goiania", "sao paulo", "divisa no…
## $ customer_state     <chr> "SC", "GO", "SP", "MG", "SP", "SP", "RJ", "CE", "SP…
## $ date_purchased     <date> 2016-10-09, 2017-04-14, 2017-04-26, 2017-04-22, 20…
## $ date_delivered     <date> 2016-11-09, 2017-05-08, 2017-05-08, 2017-05-12, 20…
## $ customer_lat       <dbl> -27.442895, -16.611184, -23.576068, -21.509981, -23…
## $ customer_lng       <dbl> -48.40148, -49.32568, -46.53257, -46.19579, -47.467…
## $ fr                 <dbl> 0.120578464, 0.208205331, 0.054487721, 0.125280521,…
## $ fr_segment         <chr> "low", "high", "low", "low", "high", "low", "low", …
## $ days_to_deliver    <dbl> 31, 24, 12, 20, 9, 15, 13, 44, 14, 18, 15, 41, 17, …
## $ distance_km        <dbl> 712, 1137, 690, 352, 809, 825, 444, 2980, 436, 808,…
df <- df %>% 
  filter(days_to_deliver <= 40) %>%  # filter all orders that took more than 2 months to be delivered
  filter(distance_km < 2000)

Association between distance and delivery time

With relation to freight ratio

plot(distance_km ~ days_to_deliver, data = df %>% sample_frac(0.01))

boxplot(df$days_to_deliver)

boxplot(df$distance_km)

fit <- lm(days_to_deliver ~ poly(distance_km,2), data = df)
summary(fit)
## 
## Call:
## lm(formula = days_to_deliver ~ poly(distance_km, 2), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.413  -4.154  -1.546   2.564  32.942 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             11.28095    0.02063  546.75   <2e-16 ***
## poly(distance_km, 2)1  899.71365    6.27180  143.45   <2e-16 ***
## poly(distance_km, 2)2 -307.22799    6.27180  -48.99   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.272 on 92395 degrees of freedom
## Multiple R-squared:  0.1992, Adjusted R-squared:  0.1991 
## F-statistic: 1.149e+04 on 2 and 92395 DF,  p-value: < 2.2e-16
# # Regular scatterplot. The correlation is invisible
# df %>% 
#   sample_frac(0.5) %>% 
#   select(distance_km, days_to_deliver) %>% 
# ggplot(aes(x = distance_km, y = days_to_deliver)) +
#   geom_point(color = "steelblue", alpha = 0.7) +
#   theme_minimal()


# 2D histogram. The correlation becomes evident
# See
# https://stackoverflow.com/questions/54092169/how-to-plot-ggplots-hexagon-only-if-number-greater-than-threshold
df %>% 
  sample_frac(1) %>% 
  # filter(fr_segment == "high") %>% 
  select(distance_km, days_to_deliver, fr_segment) %>% 
  mutate(distance_bin = cut(distance_km, breaks = seq(0, 2500, by = 10))) %>% 
  group_by(distance_bin, fr_segment) %>%
  mutate(distance_bin = max(distance_km)) %>%
ggplot(aes(x = distance_bin, y = days_to_deliver, 
           fill = cut(..count.., 2^seq(4,15)) )) +
  geom_hex(bins = 30) +
  scale_fill_viridis_d(labels = list(2^seq(4,11),2^seq(5,12)) %>% pmap(~ paste0(.x, "-", .y))) +
  # scale_fill_continuous(type = "viridis", trans = "log", breaks = 2^seq(15)) +
  theme_minimal() +
  coord_fixed(ratio = 2000 / 40) +
  labs(
    title = "Number of orders delivered in N days for each distance",
    subtitle = "by freight ratio segment",
    x = "Distance (km)",
    y = "Days to deliver",
    fill = "Number of\nOrders"
  ) +
  facet_wrap(vars(fr_segment), ncol = 2) +
  theme(strip.text = element_text(size = 12))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# # Continous version - difficult to threshold
# df %>% 
#   sample_frac(1) %>% 
#   select(distance_km, days_to_deliver) %>% 
#   mutate(distance_bin = cut(distance_km, breaks = seq(0, 2500, by = 10))) %>% 
#   group_by(distance_bin) %>%
#   mutate(distance_bin = max(distance_km)) %>%
# ggplot(aes(x = distance_bin, y = days_to_deliver )) +
#   geom_hex(bins = 30) +
#   scale_fill_continuous(type = "viridis", trans = "log", breaks = 2^seq(15)) +
#   theme_minimal() +
#   coord_fixed(ratio = 50) +
#   labs(
#     title = "Number of orders delivered in N days for each distance",
#     x = "Distance (km)",
#     y = "Days to deliver",
#     fill = "Number of\norders"
#   )

Freight ratio and days to delivery

df %>%
  sample_frac(1) %>% 
  select(fr, days_to_deliver, fr_segment) %>% 
  mutate(delivery_times_pctile = ntile(days_to_deliver, 5)) %>% 
  ggbarstats(
   y = fr_segment,
   x = delivery_times_pctile,
   label = "percent",
   results.subtitle = F
  ) +
  scale_fill_brewer(palette = "Dark1", direction = 1, labels = c("Very Slow","Slow","Average","Fast","Very Fast")) +
  theme(axis.text.x = element_text(size = 12)) +
  labs(
    title = "Delivery times for high and low freight ratio",
    x = "Freight ratio segment" 
  ) +
    guides(fill = guide_legend(title = "Delivery time"))  # Change "New Legend Title" to your desired title

Differences in price

Smaller orders have much higher freight ratio, but that is not surprising, and we already knew it.

df %>% 
  sample_frac(1) %>% 
  select(price, fr, fr_segment) %>% 
  filter(fr > 0) %>%
  ggbetweenstats(
    x = fr_segment,
    y = price,
    results.subtitle = F
  ) +
  scale_y_continuous(trans = "log") +
  theme(axis.text.x = element_text(size = 12))

Linear modelling freight ratio - Exploration

d_sample <- df %>% sample_frac(0.05)

# Explore using only on a sample of df_distance to save time
d_sample %>% 
  filter(fr > 0) %>% 
  select(fr, distance_km, product_weight_g, product_volume_cm3, days_to_deliver) %>% 
  na.omit() %>% 
  ggpairs()

d_sample_std <- d_sample %>% 
  mutate_at(c("fr", "distance_km", "product_weight_g", "product_volume_cm3", "days_to_deliver"), .funs=scale)

df2fit <- df %>% 
  select(fr, distance_km, product_weight_g, product_volume_cm3, days_to_deliver, price) %>%
  filter(fr > 0) %>% 
  mutate(log_fr = log(fr))



# Modelling freight_value also gives a good result
fit <- lm(freight_value ~ distance_km + 
                          product_weight_g +
                          product_volume_cm3 +
                          # days_to_deliver + 
                          # fr +
                          log(price), 
                          data = df) 
fit %>% summary
## 
## Call:
## lm(formula = freight_value ~ distance_km + product_weight_g + 
##     product_volume_cm3 + log(price), data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -85.22  -3.07  -0.08   2.48 323.33 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -1.957e-01  1.616e-01  -1.211    0.226    
## distance_km         1.168e-02  7.662e-05 152.403   <2e-16 ***
## product_weight_g    1.479e-03  1.463e-05 101.044   <2e-16 ***
## product_volume_cm3  1.671e-04  2.296e-06  72.801   <2e-16 ***
## log(price)          1.822e+00  3.823e-02  47.650   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.61 on 92378 degrees of freedom
##   (15 observations deleted due to missingness)
## Multiple R-squared:  0.5637, Adjusted R-squared:  0.5637 
## F-statistic: 2.984e+04 on 4 and 92378 DF,  p-value: < 2.2e-16
car::vif(fit)
##        distance_km   product_weight_g product_volume_cm3         log(price) 
##           1.013149           2.959883           2.841904           1.218997
# Modelling freight_ratio
# Price is by far the most important factor (negatively correlated!): 19% of the variance

fit <- lm(fr ~ price, data = df2fit)
summary(fit)
## 
## Call:
## lm(formula = fr ~ price, data = df2fit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.23568 -0.08162 -0.02921  0.05242  1.75412 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.412e-01  4.385e-04   550.1   <2e-16 ***
## price       -2.921e-04  1.987e-06  -147.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1112 on 92082 degrees of freedom
## Multiple R-squared:   0.19,  Adjusted R-squared:   0.19 
## F-statistic: 2.16e+04 on 1 and 92082 DF,  p-value: < 2.2e-16
# We get close to 80% of explained variance!.
fit <- lm(fr ~ 
            distance_km +
            product_weight_g +
            # product_volume_cm3 +
            # days_to_deliver +
            log(price),
            data = df2fit) 

fit %>% summary
## 
## Call:
## lm(formula = fr ~ distance_km + product_weight_g + log(price), 
##     data = df2fit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57900 -0.03513 -0.00837  0.02561  0.48585 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       6.927e-01  9.955e-04   695.8   <2e-16 ***
## distance_km       8.584e-05  4.724e-07   181.7   <2e-16 ***
## product_weight_g  8.575e-06  5.745e-08   149.2   <2e-16 ***
## log(price)       -1.263e-01  2.350e-04  -537.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05917 on 92065 degrees of freedom
##   (15 observations deleted due to missingness)
## Multiple R-squared:  0.7707, Adjusted R-squared:  0.7707 
## F-statistic: 1.031e+05 on 3 and 92065 DF,  p-value: < 2.2e-16
car::vif(fit)
##      distance_km product_weight_g       log(price) 
##         1.013073         1.201499         1.214290
# plot(fit)

Linear modelling freight ratio - Dig deeper

# df2fit <- df %>% 
#   select(fr, distance_km, product_weight_g, product_volume_cm3, days_to_deliver, price) %>%
#   filter(fr > 0) %>% 
#   mutate(log_fr = log(fr))


# We get > 80% of explained variance!.
fit <- lm(fr ~ 
            log(distance_km + 1) +
            product_weight_g +
            product_volume_cm3 +
            # log(days_to_deliver + 1) +
            log(price),
            data = df2fit) 

fit %>% summary
## 
## Call:
## lm(formula = fr ~ log(distance_km + 1) + product_weight_g + product_volume_cm3 + 
##     log(price), data = df2fit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48202 -0.03531 -0.01087  0.02513  0.46884 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.832e-01  1.217e-03  479.28   <2e-16 ***
## log(distance_km + 1)  2.721e-02  1.482e-04  183.54   <2e-16 ***
## product_weight_g      6.662e-06  8.973e-08   74.24   <2e-16 ***
## product_volume_cm3    3.682e-07  1.408e-08   26.15   <2e-16 ***
## log(price)           -1.271e-01  2.347e-04 -541.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05889 on 92064 degrees of freedom
##   (15 observations deleted due to missingness)
## Multiple R-squared:  0.7729, Adjusted R-squared:  0.7729 
## F-statistic: 7.832e+04 on 4 and 92064 DF,  p-value: < 2.2e-16
car::vif(fit)
## log(distance_km + 1)     product_weight_g   product_volume_cm3 
##             1.015760             2.958916             2.843141 
##           log(price) 
##             1.222443
# plot(fit)
# Let's make a df with the log of the values we are interested and explore
# the correlations

df2fit <- df %>%
  filter(fr > 0, distance_km > 0, price > 0, price < 2000) %>% 
  mutate(log_fr = log(fr), log_distance_km = log(distance_km), log_price = log(price), fr_pct = fr*100) %>% 
  select(fr, fr_pct, freight_value, price, distance_km, log_fr, log_price, log_distance_km, product_weight_g, product_volume_cm3) %>% 
  filter(log_price > 0, log_distance_km > 0)
  
fit <- lm(
  fr ~ 
    log_price +
    distance_km +
    product_weight_g +
    product_volume_cm3,
  data = df2fit
)  

# log_price explains ~ 64% of the variance in fr
fit <- lm(fr ~ log_price, data = df2fit)

# distance_km explains about 3.5% of the variance
fit <- lm(fr ~ distance_km, data = df2fit)

# together they explain ~ 72% of the variance
fit <- lm(fr ~ log_price + I(distance_km^(0.5)), data = df2fit)

# adding weight and volume explains an additional 5%, to 77% of the variance
# however this leads to a certain extent of violation of lm assumptions

# the best model is obtained with log(1/price) and the sqrt(distance_km) 78% of the variance
fit <- lm(fr ~ log(I(1/log_price)) + I(distance_km^(0.5)), data = df2fit)

# however the most important evidence is that fr is predicted almost entirely from price
# wrt other expected variables, such as distance, weight and volume, so we will use the 
# simplest model:
fit <- lm(fr ~ log_price, data = df2fit)

fit %>% summary
## 
## Call:
## lm(formula = fr ~ log_price, data = df2fit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.42723 -0.04452 -0.01306  0.03381  0.51545 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.6779265  0.0011887   570.3   <2e-16 ***
## log_price   -0.1090671  0.0002689  -405.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07388 on 91870 degrees of freedom
## Multiple R-squared:  0.6417, Adjusted R-squared:  0.6417 
## F-statistic: 1.645e+05 on 1 and 91870 DF,  p-value: < 2.2e-16
# car::vif(fit)
# plot(fit)

Now freight_value

df2fit <- df %>%
  filter(fr > 0, freight_value > 0, distance_km > 0, price > 0, price < 2000) %>%
  mutate(log_fr = log(fr), log_freight_value = log(freight_value),
         log_distance_km = log(distance_km), log_price = log(price)) %>%
  select(fr, freight_value, log_freight_value, price, distance_km, 
         log_fr, log_price, log_distance_km, product_weight_g, product_volume_cm3) %>%
  filter(log_price > 0, log_distance_km > 0)
  

# Modelling freight_value also gives a good result
fit <- lm(freight_value ~ distance_km + 
                          product_weight_g +
                          product_volume_cm3 +
                          log(price), 
                          data = df2fit) 

fit %>% summary
## 
## Call:
## lm(formula = freight_value ~ distance_km + product_weight_g + 
##     product_volume_cm3 + log(price), data = df2fit)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -80.36  -3.02  -0.14   2.38 324.65 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.263e-01  1.575e-01   2.706  0.00681 ** 
## distance_km        1.174e-02  7.430e-05 157.961  < 2e-16 ***
## product_weight_g   1.491e-03  1.421e-05 104.931  < 2e-16 ***
## product_volume_cm3 1.597e-04  2.233e-06  71.485  < 2e-16 ***
## log(price)         1.696e+00  3.729e-02  45.487  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.292 on 91852 degrees of freedom
##   (15 observations deleted due to missingness)
## Multiple R-squared:  0.5712, Adjusted R-squared:  0.5711 
## F-statistic: 3.058e+04 on 4 and 91852 DF,  p-value: < 2.2e-16
car::vif(fit)
##        distance_km   product_weight_g product_volume_cm3         log(price) 
##           1.013187           2.941070           2.828425           1.215112
# note also the positive association between price and dimensions 
lm(price ~ product_volume_cm3 + product_weight_g, data = df2fit) %>% summary
## 
## Call:
## lm(formula = price ~ product_volume_cm3 + product_weight_g, data = df2fit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -570.03  -61.75  -36.65   13.72 1910.01 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        8.335e+01  5.791e-01  143.92   <2e-16 ***
## product_volume_cm3 6.445e-04  3.518e-05   18.32   <2e-16 ***
## product_weight_g   1.210e-02  2.195e-04   55.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 146.7 on 91854 degrees of freedom
##   (15 observations deleted due to missingness)
## Multiple R-squared:  0.1328, Adjusted R-squared:  0.1328 
## F-statistic:  7036 on 2 and 91854 DF,  p-value: < 2.2e-16
fit %>% summary
## 
## Call:
## lm(formula = freight_value ~ distance_km + product_weight_g + 
##     product_volume_cm3 + log(price), data = df2fit)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -80.36  -3.02  -0.14   2.38 324.65 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.263e-01  1.575e-01   2.706  0.00681 ** 
## distance_km        1.174e-02  7.430e-05 157.961  < 2e-16 ***
## product_weight_g   1.491e-03  1.421e-05 104.931  < 2e-16 ***
## product_volume_cm3 1.597e-04  2.233e-06  71.485  < 2e-16 ***
## log(price)         1.696e+00  3.729e-02  45.487  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.292 on 91852 degrees of freedom
##   (15 observations deleted due to missingness)
## Multiple R-squared:  0.5712, Adjusted R-squared:  0.5711 
## F-statistic: 3.058e+04 on 4 and 91852 DF,  p-value: < 2.2e-16
# car::vif(fit)
# plot(fit)


# freight ratio ggpairs
df2fit %>% 
  sample_frac(0.01) %>% 
  select(log_fr, log_price, distance_km, product_volume_cm3, product_weight_g) %>% 
  ggpairs() +
  theme_minimal() +
  ggtitle("Freight Ratio associations")

# freight value ggpairs
df2fit %>% 
  sample_frac(0.01) %>% 
  select(log_freight_value, log_price, distance_km, product_volume_cm3, product_weight_g) %>% 
  ggpairs() +
  theme_minimal() +
  ggtitle("Freight Value associations")

lm(fr ~ poly(distance_km,2), data = df2fit) %>% summary
## 
## Call:
## lm(formula = fr ~ poly(distance_km, 2), data = df2fit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25906 -0.08798 -0.02384  0.06355  0.69411 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            0.2060344  0.0003991  516.27   <2e-16 ***
## poly(distance_km, 2)1  7.2081861  0.1209629   59.59   <2e-16 ***
## poly(distance_km, 2)2 -1.7793287  0.1209629  -14.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.121 on 91869 degrees of freedom
## Multiple R-squared:  0.03939,    Adjusted R-squared:  0.03937 
## F-statistic:  1884 on 2 and 91869 DF,  p-value: < 2.2e-16

Garbage collector

Other non-numeric variables

Suggestion: try doing the heatmaps using reactable